• DOMAIN:
Entertainment
• CONTEXT:
Company X owns a movie application and repository that provides movie streaming to millions of users on a subscription basis. The company wants to automate the extraction of cast and crew information for each scene of a movie, so that when a user pauses the movie and clicks the cast-information button, the app shows details of the actors in the scene. The company has in-house computer vision and multimedia experts who need to detect faces in screenshots taken from movie scenes.
• TASK:
Help to create an image dataset to be used by the AI team to build an image classifier. Profile images of people are given.
import tensorflow as tf

# Report the TensorFlow version and whether a GPU runtime is attached.
print("You are using TensorFlow version", tf.__version__)
gpu_devices = tf.config.list_physical_devices('GPU')
if gpu_devices:
    print("You have a GPU enabled.")
else:
    print("Enable a GPU before running this notebook.")
import numpy as np
import pandas as pd
import os
from tqdm.notebook import trange, tqdm
import cv2
from google.colab.patches import cv2_imshow
from IPython.display import Image, display, Markdown, clear_output
from zipfile import ZipFile
import warnings
warnings.filterwarnings("ignore")
import random
import dlib
random.seed(0)  # fixed seed so random image sampling below is reproducible
# Mount Google Drive so the project archive can be read from MyDrive.
from google.colab import drive
drive.mount('/content/drive/')
# Locate the training-image archive on Google Drive and unpack it
# into the current working directory.
project_path = '/content/drive/MyDrive/Colab Notebooks/Project_Object_Detection/OD_Part_2'
image_files = 'Part 2 - training images.zip'
images_zip_path = os.path.join(project_path, image_files)
with ZipFile(images_zip_path, 'r') as z:
    z.extractall()
## Get the Unzip Location in the drive
# The top-level folder of the first archive entry is where files landed.
zip_dir_loc = z.filelist[0].filename.split("/")[0]
zip_dir_loc
# Build the full path of every extracted image file.
raw_img_file_names = [os.path.join(zip_dir_loc, name) for name in os.listdir(zip_dir_loc)]
raw_img_file_names[:5]
# Read every image into memory and stack them into one numpy array.
# NOTE(review): assumes all images share the same dimensions (600x600x3),
# otherwise np.array would produce an object array — confirm with the dataset.
img_list = np.array([cv2.imread(path) for path in tqdm(raw_img_file_names)])
display(Markdown(f"#### {img_list.shape}"))
# 1091 images in the folder; each is a 600 x 600 RGB image.
# Preview a few images as one horizontal strip, flushed every 5 thumbnails.
nums = 5
strip = np.zeros((224, 1, 3))
for i in range(1, nums + 1):
    thumb = cv2.resize(img_list[i], (224, 224))
    strip = np.concatenate([strip, thumb], axis=1)
    if i % 5 == 0:
        cv2_imshow(strip)
        strip = np.zeros((224, 1, 3))
# Fix: only show the leftover strip if it actually contains thumbnails —
# the original unconditionally displayed an empty 1-pixel-wide image here.
if strip.shape[1] > 1:
    cv2_imshow(strip)
def test_bb(df, fname, title=""):
    """Draw the stored bounding box(es) for one image and display it.

    Args:
        df: DataFrame with 'Image_Name', 'x', 'y', 'w', 'h' columns.
        fname: image path; matched exactly against the 'Image_Name' column.
        title: label drawn above each box.
    """
    tst_img = cv2.imread(fname)
    temp_df = df[df['Image_Name'] == fname]
    for rows in temp_df.index:
        # Read from the filtered frame (the original indexed the full df —
        # same result, but this is clearer) and cast to int: the columns can
        # hold numpy/object values and OpenCV needs plain integer coordinates.
        x = int(temp_df['x'][rows])
        y = int(temp_df['y'][rows])
        w = int(temp_df['w'][rows])
        h = int(temp_df['h'][rows])
        cv2.rectangle(tst_img, (x, y), (x + w, y + h), (255, 0, 0), 2)
        cv2.putText(tst_img, title, (int((x + w) * 0.75), y - 3),
                    cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
    cv2_imshow(tst_img)
def show_face(img_list, scale=1.0):
    """Display each image from a list of file paths, resized by `scale`,
    preceded by its file name as a markdown heading."""
    for imgs in img_list:
        picture = cv2.imread(imgs)
        new_size = (int(picture.shape[1] * scale), int(picture.shape[0] * scale))
        picture = cv2.resize(picture, new_size)
        display(Markdown(f"#### {imgs}"))
        cv2_imshow(picture)
def get_metrics(df, method=''):
    """Score a detector's bounding-box DataFrame.

    Accounting used throughout this notebook:
      - False positive: an image where more than one face was found
        (counted once per image, via nunique on 'Image_Name').
      - False negative: a row with w == -1, i.e. no face detected.
      - True positive: every other row.

    Args:
        df: DataFrame with 'w', 'Total_Faces' and 'Image_Name' columns.
        method: label stored in the returned dict.

    Returns:
        dict with TP/FP/FN counts, precision, recall, F1 and the method name.
    """
    fp = df[df['Total_Faces'] > 1]['Image_Name'].nunique()
    fn = len(df[df['w'] == -1])
    tp = len(df[df['w'] != -1])
    # Guard the denominators so an empty or all-miss DataFrame returns 0.0
    # scores instead of raising ZeroDivisionError.
    precision = tp / (tp + fp) if (tp + fp) else 0.0
    recall = tp / (tp + fn) if (tp + fn) else 0.0
    f1 = 2 * (precision * recall) / (precision + recall) if (precision + recall) else 0.0
    return({'True_Positive': tp,
            'False_Positive': fp,
            'False_Negative': fn,
            'Precision': precision,
            'Recall': recall,
            'F1': f1,
            'Method': method})
- True Positives: a face correctly detected in an image.
- False Positives: an image where more than one face is detected, since each image contains exactly one face.
- False Negatives: no face detected in an image that contains a face.
- Scores: Precision, Recall and F1 Score.
# Running scoreboard: one row per detection method, appended as each is evaluated.
metric_cols = ['Method', 'True_Positive', 'False_Positive', 'False_Negative',
               'Precision', 'Recall', 'F1']
score_board = pd.DataFrame(columns=metric_cols)
Object Detection using Haar feature-based cascade classifiers is an effective object detection method proposed by Paul Viola and Michael Jones in their paper, "Rapid Object Detection using a Boosted Cascade of Simple Features" (2001). It is a machine learning based approach where a cascade function is trained from a lot of positive and negative images. It is then used to detect objects in other images.
!wget https://raw.githubusercontent.com/opencv/opencv/master/data/haarcascades/haarcascade_frontalface_default.xml
# One row per image: x, y, w, h of the bounding box plus the file name.
haar_img_box_df = pd.DataFrame(columns=['x', 'y', 'w', 'h', 'Total_Faces', 'Image_Name'])
haar_img_box_df
# Load the pre-trained frontal-face Haar cascade and try it on one image.
face_cascade = cv2.CascadeClassifier('haarcascade_frontalface_default.xml')
test_img = cv2.imread(raw_img_file_names[0])
grey = cv2.cvtColor(test_img, cv2.COLOR_BGR2GRAY)
# Detect faces (scaleFactor=1.1, minNeighbors=4)
faces = face_cascade.detectMultiScale(grey, 1.1, 4)
# Draw rectangle around the faces
for (x, y, w, h) in faces:
    cv2.rectangle(test_img, (x, y), (x + w, y + h), (255, 0, 0), 2)
    cv2.putText(test_img, "HaarCascadeClassifier", (int((x + w) * 0.75), y - 3),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
# Display the output. Fix: if no face is detected the loop never runs and
# x/y/w/h are undefined — the original raised NameError in that case.
if len(faces):
    display(Markdown(f"### Bounding Box parameters are `x`:{x}, `y`:{y}, `width`:{w}, `height`:{h}"))
else:
    display(Markdown("### No face detected in the test image"))
cv2_imshow(test_img)
- The Haar based Cascade Classifier correctly detects the face
%%time
haar_undetected_images = []
haar_detected_images = []
for imgs, fnames in tqdm(zip(img_list,raw_img_file_names)):
gray = cv2.cvtColor(imgs,cv2.COLOR_BGR2GRAY)
faces = face_cascade.detectMultiScale(gray,1.1,4)
if len(faces) == 0:
haar_undetected_images.append(fnames)
temp_dict = {'x':0,
'y':0,
'w':-1,
'h':-1,
'Total_Faces':0,
'Image_Name':fnames}
else:
haar_detected_images.append(fnames)
for (x,y,w,h) in faces:
temp_dict = {'x':x,
'y':y,
'w':w,
'h':h,
'Total_Faces':len(faces),
'Image_Name':fnames}
haar_img_box_df = haar_img_box_df.append(temp_dict,ignore_index=True)
display(Markdown(f"#### Detected faces for {len(haar_detected_images)} images"))
display(Markdown(f"#### Failed to detect faces for {len(haar_undetected_images)} images"))
haar_img_box_df
- The algorithm detected a face (or faces) in 930 images.
- The algorithm failed to detect a face in 161 images.
# Inspect the rows where the cascade reported more than one face (false positives).
haar_img_box_df[haar_img_box_df['Total_Faces'] > 1]
- There are 156 rows where more than 1 face was detected — these are false positives.
# Visual spot-checks: one correct detection and one multi-face false positive.
for heading, sample in [
    ("### (1) Correctly detected 1 face", "training_images/real_00115.jpg"),
    ("### (2) Incorrectly detected multiple faces", "training_images/real_00730.jpg"),
]:
    display(Markdown(heading))
    test_bb(haar_img_box_df, sample, title="Haar")
1. The algorithm may have incorrectly detected multiple faces in the 2nd image along with the correct one.
# Show the last few images where Haar found no face.
show_face(haar_undetected_images[-5:], scale=0.4)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
score_board = pd.concat(
    [score_board, pd.DataFrame([get_metrics(haar_img_box_df, method='Haar')])],
    ignore_index=True)
score_board
- The Haar method produced both false negatives and false positives, reaching an F1 score of about 88%.

MTCNN is a framework developed as a solution for both face detection and face alignment. The process consists of three stages of convolutional networks that are able to recognize faces and landmark locations such as eyes, nose, and mouth. The paper proposes MTCNN as a way to integrate both tasks (recognition and alignment) using multi-task learning. In the first stage it uses a shallow CNN to quickly produce candidate windows. In the second stage it refines the proposed candidate windows through a more complex CNN. And lastly, in the third stage it uses a third, even more complex CNN to further refine the result and output facial landmark positions.
mtcnn package¶!pip install mtcnn
mtcnn library and create an instance of the same.¶from mtcnn.mtcnn import MTCNN
mtcnn_det = MTCNN()
# Sanity-check MTCNN on the first image. MTCNN expects RGB input while
# OpenCV loads BGR, hence the colour conversion.
mtcnn_tst_img = cv2.imread(raw_img_file_names[0])
mt_cvt = cv2.cvtColor(mtcnn_tst_img, cv2.COLOR_BGR2RGB)
mt_faces = mtcnn_det.detect_faces(mt_cvt)
for face in mt_faces:
    box_x, box_y, box_w, box_h = face['box']
    cv2.rectangle(mtcnn_tst_img, (box_x, box_y),
                  (box_x + box_w, box_y + box_h), (255, 0, 0), 2)
    cv2.putText(mtcnn_tst_img, "MTCNN", (int((box_x + box_w) * 1), box_y - 3),
                cv2.FONT_HERSHEY_SIMPLEX, 0.5, (0, 0, 255))
cv2_imshow(mtcnn_tst_img)
- The face has been correctly detected.
- Also, the bounding box is rectangular in shape, unlike the Haar based algorithm's.
# One row per image: MTCNN bounding box (x, y, w, h) plus the file name.
mtcnn_img_box_df = pd.DataFrame(columns=['x', 'y', 'w', 'h', 'Total_Faces', 'Image_Name'])
mtcnn_img_box_df
%%time
mtcnn_undetected_images = []
mtcnn_detected_images = []
for imgs, fnames in tqdm(zip(img_list,raw_img_file_names)):
cvt_img = cv2.cvtColor(imgs,cv2.COLOR_BGR2RGB)
faces = mtcnn_det.detect_faces(cvt_img)
if len(faces) == 0:
mtcnn_undetected_images.append(fnames)
temp_dict = {'x':0,
'y':0,
'w':-1,
'h':-1,
'Total_Faces':0,
'Image_Name':fnames}
else:
mtcnn_detected_images.append(fnames)
for face in faces:
temp_dict = {'x':face['box'][0],
'y':face['box'][1],
'w':face['box'][2],
'h':face['box'][3],
'Total_Faces':len(faces),
'Image_Name':fnames}
mtcnn_img_box_df = mtcnn_img_box_df.append(temp_dict,ignore_index=True)
display(Markdown(f"#### Detected faces for {len(mtcnn_detected_images)} images"))
display(Markdown(f"#### Failed to detect faces for {len(mtcnn_undetected_images)} images"))
- The MTCNN has successfully detected a face (or faces) in 1086 images.
- The MTCNN failed to detect a face in only 5 images, versus 161 for the Haar based method. MTCNN has performed better than the Haar based method, although it is slower.
mtcnn_img_box_df
# Inspect the multi-face detections (false positives) and count them.
mtcnn_multi = mtcnn_img_box_df[mtcnn_img_box_df['Total_Faces'] > 1]
display(mtcnn_multi)
display(Markdown(f"#### Number of images with more than 1 face detected : {len(mtcnn_multi)}"))
# Visual spot-checks for MTCNN: one clean detection and one false positive.
for heading, sample in [
    ("### (1) Correctly detected 1 face", "training_images/real_00115.jpg"),
    ("### (2) Correctly detected one face and incorrectly the other one", "training_images/real_00699.jpg"),
]:
    display(Markdown(heading))
    test_bb(mtcnn_img_box_df, sample, title="MTCNN")
- The MTCNN has correctly detected the face in most of the images.
- There are about 34 images where more than 1 face is detected, compared to 156 for the Haar based method.
# Show every image where MTCNN found no face.
show_face(mtcnn_undetected_images, scale=0.4)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
score_board = pd.concat(
    [score_board, pd.DataFrame([get_metrics(mtcnn_img_box_df, method='MTCNN')])],
    ignore_index=True)
score_board
- The number of false positives has reduced significantly and without any false negatives.
- The method scores a 99% F1 score, an improvement of ~11% over the Haar based method.
- There are about 5 images where a face could not be detected, due to:
 - Face partially covered.
 - Face zoomed such that it crops part of the face.
 - Poor illumination or partially lit surfaces on the face.
- The MTCNN has detected face in images where,
- Tilted face
- Face turned away.
# Extract the bounding box from the dlib detector's output.
def getBB(image, rect):
    """Convert a dlib rectangle into an (x, y, w, h) box clipped to the image.

    Args:
        image: numpy array of shape (H, W[, C]) the rect refers to.
        rect: dlib rectangle exposing left()/top()/right()/bottom().

    Returns:
        (x, y, w, h) with the box clamped inside the image bounds. Unlike the
        original version, both corners are clamped, so w and h can never be
        negative even for rectangles partially or fully outside the frame.
    """
    x = min(max(0, rect.left()), image.shape[1])
    y = min(max(0, rect.top()), image.shape[0])
    x_end = min(max(rect.right(), x), image.shape[1])
    y_end = min(max(rect.bottom(), y), image.shape[0])
    w = x_end - x
    h = y_end - y
    return (x, y, w, h)
!wget "https://www.adrianbulat.com/downloads/dlib/mmod_human_face_detector.dat"
nums = 9
# dlib CNN face detector (MMOD weights downloaded above); the lighter
# HOG-based alternative would be: detector = dlib.get_frontal_face_detector()
detector = dlib.cnn_face_detection_model_v1("mmod_human_face_detector.dat")
img = np.zeros((224, 1, 3))
for nic in range(1, nums + 1):
    # Fix: random.randint's upper bound is INCLUSIVE, so the original
    # randint(0, img_list.shape[0]) could raise IndexError; randrange excludes it.
    image = cv2.imread(raw_img_file_names[random.randrange(img_list.shape[0])])
    image = cv2.resize(image, (224, 224))
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)  # detector runs on grayscale
    rects = detector(gray, 1)
    faces = [getBB(image, r.rect) for r in rects]
    for (x, y, w, h) in faces:
        cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
    img = np.concatenate([img, image], axis=1)
    if (nic % 5 == 0):
        cv2_imshow(img)
        img = np.zeros((224, 1, 3))
cv2_imshow(img)
- All faces are correctly detected.
# One row per image: dlib bounding box (x, y, w, h) plus the file name.
dlib_img_box_df = pd.DataFrame(columns=['x', 'y', 'w', 'h', 'Total_Faces', 'Image_Name'])
dlib_img_box_df
%%time
dlib_undetected_images = []
dlib_detected_images = []
# detector = dlib.get_frontal_face_detector()
detector = dlib.cnn_face_detection_model_v1("mmod_human_face_detector.dat")
for imgs, fnames in tqdm(zip(img_list,raw_img_file_names)):
cvt_img = cv2.cvtColor(imgs,cv2.COLOR_BGR2GRAY)
# cvt_img = cv2.resize(image, (224,224))
rects = detector(cvt_img, 1)
if len(rects) == 0:
dlib_undetected_images.append(fnames)
temp_dict = {'x':0,
'y':0,
'w':-1,
'h':-1,
'Total_Faces':0,
'Image_Name':fnames}
else:
dlib_detected_images.append(fnames)
faces = [getBB(cvt_img, r.rect) for r in rects]
for face in faces:
temp_dict = {'x':face[0],
'y':face[1],
'w':face[2],
'h':face[3],
'Total_Faces':len(faces),
'Image_Name':fnames}
dlib_img_box_df = dlib_img_box_df.append(temp_dict,ignore_index=True)
display(Markdown(f"#### Detected faces for {len(dlib_detected_images)} images"))
display(Markdown(f"#### Failed to detect faces for {len(dlib_undetected_images)} images"))
- The model detected faces in 1090 images and failed to do so for only 1 image.
- This is the best result among the 3 different methods.
dlib_img_box_df
# Inspect the multi-face detections (false positives) and count them.
dlib_multi = dlib_img_box_df[dlib_img_box_df['Total_Faces'] > 1]
display(dlib_multi)
display(Markdown(f"#### Number of images with more than 1 face detected : {len(dlib_multi)}"))
# Visual spot-checks for dlib: one clean detection and one false positive.
for heading, sample in [
    ("### (1) Correctly detected 1 face", "training_images/real_00808.jpg"),
    ("### (2) Incorrectly detected 2 face", "training_images/real_00041.jpg"),
]:
    display(Markdown(heading))
    test_bb(dlib_img_box_df, sample, title="Dlib")
display(Markdown("### (2) Face not detected "))
show_face(dlib_undetected_images)
# DataFrame.append was removed in pandas 2.0; use pd.concat instead.
score_board = pd.concat(
    [score_board, pd.DataFrame([get_metrics(dlib_img_box_df, method="Dlib-CNN")])],
    ignore_index=True)
score_board
- As we can see, the number of false positives is far lower than with MTCNN — only 3 images had more than one face detected — and there are no false negatives.
- The method has an F1 score of ~99.8% which is 0.6% better than MTCNN approach.
# Compare the failure sets: images that each PAIR of methods both missed.
haar_set = set(haar_undetected_images)
mtcnn_set = set(mtcnn_undetected_images)
dlib_set = set(dlib_undetected_images)
show_face(haar_set & mtcnn_set, 0.4)
show_face(dlib_set & haar_set, 0.4)
show_face(mtcnn_set & dlib_set, 0.4)
- Face partially covered.
- Face tilted such that the frontal face is not completely visible.
- Face turned away such that the frontal view is not completely visible.
- Face zoomed such that it crops part of the face.
- Poor illumination or partially lit surfaces on the face.